Load the library
library(CRMetrics)
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cowplot)
library(ggplot2)
library(qs)
## qs 0.27.2. Announcement: https://github.com/qsbase/qs/issues/103
Load metadata
metadata <- read.table("metadata.tsv", sep = "\t", header = T)
crmc <- CRMetrics$new(data.path = "counts_premrna/",
metadata = metadata,
n.cores = 32)
crmc$selectMetrics()
## no metrics
## 1 1 estimated number of cells
## 2 2 mean reads per cell
## 3 3 median genes per cell
## 4 4 number of reads
## 5 5 valid barcodes
## 6 6 sequencing saturation
## 7 7 q30 bases in barcode
## 8 8 q30 bases in rna read
## 9 9 q30 bases in sample index
## 10 10 q30 bases in umi
## 11 11 reads mapped to genome
## 12 12 reads mapped confidently to genome
## 13 13 reads mapped confidently to intergenic regions
## 14 14 reads mapped confidently to intronic regions
## 15 15 reads mapped confidently to exonic regions
## 16 16 reads mapped confidently to transcriptome
## 17 17 reads mapped antisense to gene
## 18 18 fraction reads in cells
## 19 19 total genes detected
## 20 20 median umi counts per cell
crmc$plotSummaryMetrics(metrics = crmc$selectMetrics(ids = 2))$data$value %>% max
## Using 'sample' for 'comp.group'
## [1] 596564
p <- crmc$prepareCellbender()
## 2024-12-04 22:57:12.993205 Started run using 30 cores
## 2024-12-04 22:57:12.996188 Loading HDF5 Cell Ranger outputs
## 2024-12-04 22:57:13.06574 Loading 30 count matrices using 30 cores
## 2024-12-04 22:59:09.213337 Done!
## 2024-12-04 22:59:09.214395 Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL
## 2024-12-04 22:59:09.214982 Plotting
## 2024-12-04 22:59:09.68895 Done!
p
droplets <- crmc$getTotalDroplets()
droplets
## CTRL_037 CTRL_039 CTRL_09051 CTRL_09055 CTRL_09057 CTRL_09148 CTRL_09155
## 22110 18954 18699 18270 10263 30294 11430
## CTRL_10055 CTRL_1467 CTRL_652 MSA_1352 MSA_1365 MSA_1371 MSA_1375
## 4305 9966 11493 11769 10593 9537 5178
## MSA_1391 MSA_1398 MSA_1406 MSA_1436 PD_6900 PD_7044 PD_7050
## 8610 10434 5418 19383 13344 9663 9549
## PD_7219 PD_7679 PD_7731 PD_7754 PD_7958 PD_8095 PD_K1376
## 13050 3150 11664 9867 10083 1836 11964
## PD_K1411 PD_K1449
## 17889 15852
droplets["CTRL_039"] <- 1.5e4
droplets["CTRL_09051"] <- 2.5e4
droplets["CTRL_09055"] <- 1.5e4
droplets["CTRL_09155"] <- 11430
droplets["CTRL_10055"] <- 1.5e4
droplets["CTRL_1467"] <- 1.5e4
droplets["CTRL_652"] <- 11493
droplets["MSA_1352"] <- 11769
droplets["MSA_1371"] <- 9537
droplets["MSA_1375"] <- 5178
droplets["MSA_1406"] <- 1e4
droplets["PD_7679"] <- 5e3
droplets["PD_7754"] <- 1.5e4
droplets["PD_8095"] <- 3e3
subset <- c("CTRL_039","CTRL_09051","CTRL_09055","CTRL_09155","CTRL_10055","CTRL_1467","CTRL_652","MSA_1352","MSA_1371","MSA_1375","MSA_1406","PD_7679","PD_7754","PD_8095")
crmc$prepareCellbender(samples = subset, total.droplets = droplets[subset], verbose = F)
crmc$saveCellbenderScript(file = "cellbender_script.sh",
total_droplets = droplets)
p <- crmc$plotCbCells(samples = crmc$metadata$sample %>% as.character() %>% .[. != "MSA_1406"]) +
scale_x_discrete(labels = c(paste("CTRL", seq(10)), paste("MSA", seq(7)), paste("PD", seq(12)))) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
p
p <- crmc$plotCbTraining()
p
p <- crmc$plotCbCellProbs() +
theme(line = element_blank(),
axis.ticks.x = element_line())
p
crmc$plotCbAmbExp()
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p <- crmc$plotCbAmbGenes() +
theme(line = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.ticks.x = element_line())
p
crmc$addDetailedMetrics(cellbender = TRUE)
metrics.to.plot <- crmc$detailed.metrics$metric %>%
unique()
crmc$plotDetailedMetrics(metrics = metrics.to.plot, plot.geom = "violin", comp.group = "flowcell")
In order to plot our cells in a UMAP embedding, we need to perform
preprocessing of the raw count matrices. To do this, either
pagoda2 (default) or Seurat can be used.
crmc$doPreprocessing()
Then, we create the UMAP embedding using conos.
crmc$createEmbedding()
We can now plot our cells.
crmc$plotEmbedding()
We can plot cell depth, both in the UMAP embedding or as histograms for all samples or per sample.
p2a <- crmc$plotEmbedding(depth = TRUE, depth.cutoff = 5e2)
p2a
crmc$plotDepth(cutoff = 5e2)
## Warning: Removed 403 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 403 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 403 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 388 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 388 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 388 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 290 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 291 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 394 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 394 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 394 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 437 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 437 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 437 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 372 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 371 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 372 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 349 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 349 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 349 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 308 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 306 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 308 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 360 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 359 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 360 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 307 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 306 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 307 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 410 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 410 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 410 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 394 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 393 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 394 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 410 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 410 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 410 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 266 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 266 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 266 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 302 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 302 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 302 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 412 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 412 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 412 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 391 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 391 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 391 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 448 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 447 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 448 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 355 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 355 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 448 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 448 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 448 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 66 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 66 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 271 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 271 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 271 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 178 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 175 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 178 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 418 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 418 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 418 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 10 rows containing non-finite outside the scale range (`stat_align()`).
## Removed 10 rows containing non-finite outside the scale range (`stat_align()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 290 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 290 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 290 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 347 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 347 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 347 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 462 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 462 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 462 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 323 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 323 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 323 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 154 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 154 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 154 rows containing missing values or values outside the scale range
## (`geom_line()`).
crmc$detectDoublets(conda.path = "/opt/software/miniconda/4.12.0/condabin/conda")
We can plot the estimated doublets in the UMAP embedding.
p1 <- crmc$plotEmbedding(doublet.method = "scrublet")
p1
And we can plot the scores for the doublet estimations.
crmc$plotEmbedding(doublet.method = "scrublet", doublet.scores = TRUE)
crmc$detectDoublets(conda.path = "/opt/software/miniconda/4.12.0/condabin/conda", method = "doubletdetection")
We can plot the estimated doublets in the UMAP embedding.
p2 <- crmc$plotEmbedding(doublet.method = "doubletdetection")
p2
And we can plot the scores for the doublet estimations.
crmc$plotEmbedding(doublet.method = "doubletdetection", doublet.scores = TRUE)
scrub.res <- crmc$doublets$scrublet$result %>%
select(labels, sample) %>%
mutate(method = "scrublet")
dd.res <- crmc$doublets$doubletdetection$result %>%
select(labels, sample) %>%
mutate(labels = as.logical(labels),
method = "DoubletDetection")
dd.res[is.na(dd.res)] <- FALSE
plot.df <- rbind(scrub.res,
dd.res) %>%
filter(labels) %>%
group_by(sample, method) %>%
summarise(count = n())
## `summarise()` has grouped output by 'sample'. You can override using the
## `.groups` argument.
p5 <- ggplot(plot.df, aes(sample, count, fill = method)) +
geom_bar(stat = "identity", position = position_dodge()) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.position = "bottom") +
labs(x = "", y = "No. doublets", fill = "Method", title = "Doublets per sample") +
scale_fill_manual(values = c("blue","red"))
p5
p4 <- plot.df %>%
group_by(method) %>%
summarise(count = sum(count)) %>%
ggplot(aes(method, count, fill = method)) +
geom_bar(stat = "identity") +
theme_bw() +
guides(fill = "none") +
labs(x = "", y = "No. doublets", title = "Total doublets per method") +
theme(legend.position = "bottom") +
scale_fill_manual(values = c("blue","red"))
p4
plot.vec <- data.frame(scrublet = scrub.res$labels %>% as.numeric(),
doubletdetection = dd.res$labels %>% as.numeric()) %>%
apply(1, \(x) if (x[1] == 0 & x[2] == 0) "Kept" else if (x[1] == 1 & x[2] == 0) "scrublet" else if (x[1] == 0 & x[2] == 1) "DoubletDetection" else "Both") %>%
setNames(rownames(scrub.res)) %>%
factor(levels = c("Kept","scrublet","DoubletDetection","Both"))
p3 <- crmc$con$plotGraph(groups = plot.vec, mark.groups = FALSE, show.legend = TRUE, shuffle.colors = TRUE, title = "Doublets", size = 0.3) +
scale_color_manual(values = c("grey80","red","blue","black")) +
theme(legend.position = "bottom")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p3
plot.vec.ctrl039 <- plot.vec[grepl("CTRL_039", names(plot.vec))]
p6 <- crmc$con$plotGraph(groups = plot.vec.ctrl039, plot.na = FALSE, mark.groups = FALSE, show.legend = TRUE, shuffle.colors = TRUE, title = "Doublets, CTRL_039") +
scale_color_manual(values = c("grey80","blue","red","black")) +
theme(legend.position = "bottom")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p6
plot.vec.pd7044 <- plot.vec[grepl("PD_7044", names(plot.vec))]
p7 <- crmc$con$plotGraph(groups = plot.vec.pd7044, plot.na = FALSE, mark.groups = FALSE, show.legend = TRUE, shuffle.colors = TRUE, title = "Doublets, PD_7044") +
scale_color_manual(values = c("grey80","red","blue","black")) +
theme(legend.position = "bottom")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p7
plot_grid(plotlist = list(p1,p2,p3,p4,p5,p6,p7), ncol = 3)
We can also investigate the mitochondrial fraction in our cells
p1a <- crmc$plotEmbedding(mito.frac = TRUE)
p1a
We can plot all the cells to be filtered in the UMAP embedding
crmc$plotFilteredCells(type = "embedding", depth.cutoff = 5e2, doublet_method = "scrublet")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p3a <- crmc$plotFilteredCells(type = "embedding", depth.cutoff = 5e2, doublet_method = "doubletdetection") +
theme(legend.position = "bottom")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p3a
And we can plot the cells to be filtered per sample
crmc$plotFilteredCells(type = "bar", doublet_method = "scrublet", depth.cutoff = 5e2)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p4a <- crmc$plotFilteredCells(type = "bar", doublet.method = "doubletdetection", depth.cutoff = 5e2) +
theme(legend.position = "bottom")
p4a
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_grid(plotlist = list(p1a,p2a,p3a,p4a), ncol = 2)
## Warning: ggrepel: 79 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
We can also extract the raw numbers for plotting in other ways than those included here
filter.data <- crmc$plotFilteredCells(type = "export", doublet.method = "doubletdetection")
filter.data %>% head()
## # A tibble: 6 × 4
## sample cell variable value
## <chr> <chr> <chr> <dbl>
## 1 CTRL_037 CTRL_037!!GCACATATCCTAGCCT-1 depth 0
## 2 CTRL_037 CTRL_037!!GCACATATCCTAGCCT-1 mito 0
## 3 CTRL_037 CTRL_037!!GCACATATCCTAGCCT-1 doublets 0
## 4 CTRL_037 CTRL_037!!GATAGCTCATGTTTGG-1 depth 0
## 5 CTRL_037 CTRL_037!!GATAGCTCATGTTTGG-1 mito 0
## 6 CTRL_037 CTRL_037!!GATAGCTCATGTTTGG-1 doublets 0
Finally, we can filter the count matrices and save them for downstream applications.
crmc$filterCms(prefix = "cms_filtered",
method = "qs",
depth.cutoff = 5e2,
mito.cutoff = 0.05,
doublets = "doubletdetection",
samples.to.exclude = "MSA_1406",
n.cores = 100)